# Computations
import numpy as np
import pandas as pd
# scipy
import scipy.stats as stats
from scipy.stats import norm
# preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# sklearn
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.utils.fixes import loguniform
from sklearn.ensemble import RandomForestClassifier
# Text
from colorama import Fore, Back, Style
import re
# Visualisation libraries
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import missingno as msno
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from wordcloud import WordCloud
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
# sns setting
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")
# plt setting
# The bundled seaborn style sheets were renamed to 'seaborn-v0_8-*' in Matplotlib 3.6
# and the old names were removed later, so use whichever name this install provides.
plt.style.use('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we would like to predict customer churn for Telco Customer Churn data.
| Columns | Description |
|---|---|
| customerID | Customer ID |
| gender | Whether the customer is a male or a female |
| SeniorCitizen | Whether the customer is a senior citizen or not (1, 0) |
| Partner | Whether the customer has a partner or not (Yes, No) |
| Dependents | Whether the customer has dependents or not (Yes, No) |
| tenure | Number of months the customer has stayed with the company |
| PhoneService | Whether the customer has a phone service or not (Yes, No) |
| MultipleLines | Whether the customer has multiple lines or not (Yes, No, No phone service) |
| InternetService | Customer’s internet service provider (DSL, Fiber optic, No) |
| OnlineSecurity | Whether the customer has online security or not (Yes, No, No internet service) |
| OnlineBackup | Whether the customer has an online backup or not (Yes, No, No internet service) |
| DeviceProtection | Whether the customer has device protection or not (Yes, No, No internet service) |
| TechSupport | Whether the customer has tech support or not (Yes, No, No internet service) |
| StreamingTV | Whether the customer has streaming TV or not (Yes, No, No internet service) |
| StreamingMovies | Whether the customer has streaming movies or not (Yes, No, No internet service) |
| Contract | The contract term of the customer (Month-to-month, One year, Two year) |
| PaperlessBilling | Whether the customer has paperless billing or not (Yes, No) |
| PaymentMethod | The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)) |
| MonthlyCharges | The amount charged to the customer monthly |
| TotalCharges | The total amount charged to the customer |
| Churn | Whether the customer churned or not (Yes or No) |
# Load the Telco customer-churn CSV; the path is relative to the working directory.
Data = pd.read_csv('telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
def Data_info(Inp, Only_NaN = False):
    """Summarise dtype and missing-value statistics for every column of ``Inp``.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to inspect.
    Only_NaN : bool, default False
        When True, keep only the columns that contain at least one NaN.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name with the columns 'Data Type',
        'Number of NaN Values' and 'Percentage' (NaN share of the rows,
        rounded to two decimals).
    """
    dtype_frame = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    nan_frame = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    Out = dtype_frame.join(nan_frame, how='outer')
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if not Only_NaN:
        return Out
    return Out.loc[Out['Number of NaN Values']>0]
def dtypes_group(Inp):
    """Group the column names of ``Inp`` by their dtype.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame whose columns are to be grouped.

    Returns
    -------
    pandas.DataFrame
        One row per distinct dtype (the dtype is the index) and a single
        'Columns' column holding the list of column names with that dtype.
        An input without columns yields an empty frame.
    """
    Temp = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    kinds = Temp['Data Type'].unique()
    # Build the cells up front instead of assigning a nested list through .loc,
    # which relies on broadcasting rules to store a list in a single cell.
    columns_by_kind = [Temp.index[(Temp['Data Type'] == kind).values].tolist() for kind in kinds]
    return pd.DataFrame({'Columns': columns_by_kind}, index=kinds)
# missingno bar chart over the columns of Data (linear scale, since log=False).
_ = msno.bar(Data, figsize=(16,5), fontsize=14, log=False, color="#34495e")
def text_sep(txt):
    """Insert a space between a word character and the capital letter after it.

    Matches do not overlap, so in a run of capitals only the first boundary
    is split (e.g. 'customerID' -> 'customer ID').
    """
    def _spaced(match):
        return match.group(1) + ' ' + match.group(2)
    return re.sub(r"(\w)([A-Z])", _spaced, txt)
def col_details(Col):
    """Print a coloured heading for ``Col`` followed by its distinct values.

    Reads the module-level ``Data`` frame; the values are joined with ', ',
    so the column is expected to hold strings.
    """
    heading = Back.BLACK + Fore.CYAN + Style.NORMAL + '%s:' % text_sep(Col)
    distinct_values = Data[Col].unique()
    print(heading)
    print(Style.RESET_ALL)
    print('%s' % ', '.join(distinct_values))
# Capitalise the two lower-case column names.
Data.rename(columns = {'gender':'Gender', 'tenure':'Tenure'}, inplace = True)
# Split CamelCase column names into words (e.g. 'SeniorCitizen' -> 'Senior Citizen').
Data.columns = [text_sep(txt) for txt in Data.columns.tolist()]
# Churn by gender: count customers per (Gender, Churn) pair.
Feature = 'Gender'
# Aggregations are requested with a list; a set literal is not a documented spec.
Temp = Data.groupby([Feature,'Churn'])[Feature].agg(['count'])
# Share of each pair relative to the total number of customers.
Temp['Percentage'] = (100 * Temp['count'] / Temp['count'].sum()).round(2)
display(Temp)
C = ['aquamarine', 'steelblue']
SC = 'Navy'
Temp.reset_index(drop = False, inplace = True)
# Horizontal bars per churn outcome, coloured by the feature's values.
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color= SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
# The title is derived from Feature so it stays correct when this cell is reused.
fig.update_layout(title = 'Churn Percentage by %s' % Feature, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Two donut charts: feature split among churned (left) and non-churned (right) customers.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)
fig.update_traces(hole=.6, marker_line_color= SC, marker_line_width=1, opacity=1)
fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
It can be seen that there is a balance between the two genders among churned customers.
# Churn by senior-citizen status; work on a copy so Data keeps its 0/1 coding.
Temp = Data.copy()
Temp['Senior Citizen'] = Temp['Senior Citizen'].map(lambda x: 'Yes' if x ==1 else 'No')
Feature = 'Senior Citizen'
# Aggregations are requested with a list; a set literal is not a documented spec.
Temp = Temp.groupby([Feature,'Churn'])[Feature].agg(['count'])
# Share of each pair relative to the total number of customers.
Temp['Percentage'] = (100 * Temp['count'] / Temp['count'].sum()).round(2)
display(Temp)
C = ['greenyellow', 'seagreen']
SC = 'DarkGreen'
Temp.reset_index(drop = False, inplace = True)
# Horizontal bars per churn outcome, coloured by the feature's values.
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
# Fix: the title previously read 'Churn Percentage by Gender' for this feature.
fig.update_layout(title = 'Churn Percentage by %s' % Feature, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Two donut charts: feature split among churned (left) and non-churned (right) customers.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)
fig.update_traces(hole=.6, marker_line_color=SC, marker_line_width=1, opacity=1)
fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
It can be seen that only about 25.5% of the churned customers were senior citizens.
# Churn by partner status: count customers per (Partner, Churn) pair.
Feature = 'Partner'
# Aggregations are requested with a list; a set literal is not a documented spec.
Temp = Data.groupby([Feature,'Churn'])[Feature].agg(['count'])
# Share of each pair relative to the total number of customers.
Temp['Percentage'] = (100 * Temp['count'] / Temp['count'].sum()).round(2)
display(Temp)
C = ['bisque', 'orange']
SC = 'DarkOrange'
Temp.reset_index(drop = False, inplace = True)
# Horizontal bars per churn outcome, coloured by the feature's values.
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
# Fix: the title previously read 'Churn Percentage by Gender' for this feature.
fig.update_layout(title = 'Churn Percentage by %s' % Feature, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Two donut charts: feature split among churned (left) and non-churned (right) customers.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)
fig.update_traces(hole=.6, marker_line_color=SC, marker_line_width=1, opacity=1)
fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
Over 64% of churned customers did not have any partners.
# Churn by dependents: count customers per (Dependents, Churn) pair.
Feature = 'Dependents'
# Aggregations are requested with a list; a set literal is not a documented spec.
Temp = Data.groupby([Feature,'Churn'])[Feature].agg(['count'])
# Share of each pair relative to the total number of customers.
Temp['Percentage'] = (100 * Temp['count'] / Temp['count'].sum()).round(2)
display(Temp)
C = ['pink', 'hotpink']
SC = 'DarkRed'
Temp.reset_index(drop = False, inplace = True)
# Horizontal bars per churn outcome, coloured by the feature's values.
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 220)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
# Fix: the title previously read 'Churn Percentage by Gender' for this feature.
fig.update_layout(title = 'Churn Percentage by %s' % Feature, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Two donut charts: feature split among churned (left) and non-churned (right) customers.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)
fig.update_traces(hole=.6, marker_line_color=SC, marker_line_width=1, opacity=1)
fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
Over 82 percent of churned customers did not have any dependents.
# Churn by tenure: count customers per (Tenure, Churn) pair.
Feature = 'Tenure'
# Aggregations are requested with a list; a set literal is not a documented spec.
Temp = Data.groupby([Feature,'Churn'])[Feature].agg(['count'])
# Share of each pair relative to the total number of customers.
Temp['Percentage'] = (100 * Temp['count'] / Temp['count'].sum()).round(2)
Temp.reset_index(drop = False, inplace = True)
# First view: bars per churn outcome, coloured on a continuous scale by tenure.
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage',color_continuous_scale= 'ylgn', height= 450)
fig.show()
# Second view: one bar per tenure value, coloured by churn outcome.
C = ['violet', 'mediumorchid']
fig = px.bar(Temp, x= Feature, y= 'Percentage',
             color = 'Churn', text = 'Percentage', color_discrete_sequence= C, height= 500)
fig.update_traces(marker_line_color='Indigo', marker_line_width=1.2, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['yaxis'].update(range=[0, 10])
# Fix: the title previously read 'Churn Percentage by Gender' for this feature.
fig.update_layout(title = 'Churn Percentage by %s' % Feature, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
del Feature
Customers with a higher tenure tend to churn less.
# Churn by contract type: count customers per (Contract, Churn) pair.
Feature = 'Contract'
# Aggregations are requested with a list; a set literal is not a documented spec.
Temp = Data.groupby([Feature,'Churn'])[Feature].agg(['count'])
# Share of each pair relative to the total number of customers.
Temp['Percentage'] = (100 * Temp['count'] / Temp['count'].sum()).round(2)
display(Temp)
C = ['greenyellow', 'limeGreen','DarkGreen']
SC = 'DarkGreen'
Temp.reset_index(drop = False, inplace = True)
# Horizontal bars per churn outcome, coloured by the feature's values.
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 240)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
# Fix: the title previously read 'Churn Percentage by Gender' for this feature.
fig.update_layout(title = 'Churn Percentage by %s' % Feature, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Two donut charts: feature split among churned (left) and non-churned (right) customers.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)
fig.update_traces(hole=.6, marker_line_color= SC, marker_line_width=1, opacity=1)
fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.845, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
The majority of churned customers were on a month-to-month contract.
# Churn by payment method: count customers per (Payment Method, Churn) pair.
Feature = 'Payment Method'
# Aggregations are requested with a list; a set literal is not a documented spec.
Temp = Data.groupby([Feature,'Churn'])[Feature].agg(['count'])
# Share of each pair relative to the total number of customers.
Temp['Percentage'] = (100 * Temp['count'] / Temp['count'].sum()).round(2)
display(Temp)
C = ['azure','paleturquoise','steelblue','MidnightBlue']
SC = 'Navy'
Temp.reset_index(drop = False, inplace = True)
# Horizontal bars per churn outcome, coloured by the feature's values.
fig = px.bar(Temp, y= 'Churn', x= 'Percentage', orientation='h',
             color = Feature, text = 'Percentage', color_discrete_sequence= C, height= 260)
fig.update_traces(marker_line_color=SC, marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
# Fix: the title previously read 'Churn Percentage by Gender' for this feature.
fig.update_layout(title = 'Churn Percentage by %s' % Feature, plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Two donut charts: feature split among churned (left) and non-churned (right) customers.
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels= Temp.loc[Temp.Churn == 'Yes',Feature].values,
                     values= Temp.loc[Temp.Churn == 'Yes','count'].values,
                     name= 'Churn [Yes]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 1)
fig.add_trace(go.Pie(labels=Temp.loc[Temp.Churn == 'No', Feature].values,
                     values=Temp.loc[Temp.Churn == 'No','count'].values,
                     name= 'Churn [No]', textfont=dict(size=16),
                     marker=dict(colors = C, line=dict(color='black', width=1))), 1, 2)
fig.update_traces(hole=.6, marker_line_color= SC, marker_line_width=1, opacity=1)
fig.update_layout(legend_title=Feature, font=dict(size=14), legend=dict(orientation="v"),
                  annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=16, showarrow=False),
                               dict(text='Not Churned', x=0.85, y=0.5, font_size=16, showarrow=False)], height = 400)
fig.show()
del Feature
Customers with an automatic payment method churned less.
# Group the column names by dtype so each group can be cast in one statement.
Data_types = dtypes_group(Data)
display(Data_types)
# Cast the int64 group with int and the float64 group with float.
for _dtype_name, _caster in (('int64', int), ('float64', float)):
    Temp = Data_types.loc[Data_types.index == _dtype_name].values[0,0]
    Data[Temp] = Data[Temp].astype(_caster)
del Temp, _dtype_name, _caster
# Parse 'Total Charges' as numbers; entries that cannot be parsed become NaN.
Data['Total Charges'] = pd.to_numeric(Data['Total Charges'], errors='coerce')
First, let's convert all Yes/No columns as follows:
$$\begin{cases} 0 &\mbox{No}\\ 1 &\mbox{Yes}\end{cases}$$
Temp = []
# Collect the object-typed columns whose distinct values are exactly {'No', 'Yes'}.
Temp = [name for name in Data_types.loc[Data_types.index == 'object'].values[0,0]
        if set(Data[name].unique().tolist()) == {'No', 'Yes'}]
# Encode them as 1 (Yes) / 0 (No).
Data[Temp] = Data[Temp].replace({'Yes':1, 'No':0}).astype(int)
del Temp
Some other columns can be converted similarly; however, we first need to create a new feature.
# Collect the object-typed columns whose values are exactly Yes / No / No internet service.
Temp = [name for name in Data_types.loc[Data_types.index == 'object'].values[0,0]
        if set(Data[name].unique().tolist()) == {'No', 'No internet service', 'Yes'}]
# Temp is kept: it names the columns to encode once 'Internet Service' is coded.
print('Columns: %s' %', '.join(Temp))
Note that,
# Print the distinct values of 'Internet Service' before encoding it.
col_details('Internet Service')
This column can be coded as follows:
$$\mbox{InternetServiceType} = \begin{cases} 0 &\mbox{No} \\ 1 &\mbox{DSL}\\ 2 &\mbox{Fiber optic}\end{cases}$$
def myfun(x):
if x == 'No':
return 0
elif x == 'DSL':
return 1
else:
return 2
# Encode the column with myfun directly, then discard the helper.
Data['Internet Service'] = Data['Internet Service'].apply(myfun).astype(int)
del myfun
Since we have already included "No internet service" in InternetService, we can code the rest as
$$\begin{cases} 0 &\mbox{No, No internet service}\\ 1 &\mbox{Yes}\end{cases}$$
Data[Temp] = Data[Temp].applymap(lambda x: 1 if x =='Yes' else 0).astype(int)
Since there is already a PhoneService feature, MultipleLines can be coded as $$ \mbox{MultipleLines} = \begin{cases} 0 &\mbox{No, No phone service}\\ 1 &\mbox{Yes}\end{cases} $$
# 'Yes' -> 1; every other value ('No', 'No phone service') -> 0.
Data['Multiple Lines'] = (Data['Multiple Lines'] == 'Yes').astype(int)
# Refresh the dtype grouping and list the columns that are still object-typed.
Data_types = dtypes_group(Data)
Temp = Data_types.loc[Data_types.index == 'object'].values[0,0]
print('Columns: %s' %', '.join(Temp))
col_details('Contract')
# Ordinal code: longer contract term -> larger value.
Data['Contract'] = Data['Contract'].replace({'Month-to-month':0, 'One year':1, 'Two year':2}).astype(int)
# 'Male' -> 1; any other value -> 0.
Data['Gender'] = (Data['Gender'] == 'Male').astype(int)
col_details('Payment Method')
In this case, we cannot rank these values. Therefore, we one-hot encode the payment method.
# One-hot encode 'Payment Method' (one 0/1 column per value) and drop the original.
Data = Data.join(pd.get_dummies(Data['Payment Method']).astype(int))
Data = Data.drop(columns = ['Payment Method'])
Data_types = dtypes_group(Data)
display(Data_types)
# Names of the columns that still contain NaN values.
Temp = Data_info(Data, Only_NaN = True)
Temp = Temp.index.tolist()
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# Guard: fitting the imputer on a frame with no columns raises, so skip when nothing is missing.
# NOTE(review): the mean is computed on the full data set, i.e. before the train/test split.
if Temp:
    Data[Temp] = imp.fit_transform(Data[Temp])
Let's take a look at the variance of the features.
# Variance per feature, largest first.
# numeric_only=True skips the string column(s) still present in Data (e.g. 'customer ID'),
# which newer pandas no longer drops silently.
# Styler.format('{:.2f}') replaces the removed Styler.set_precision(2).
display(Data.drop(columns = ['Churn']).var(numeric_only=True).sort_values(ascending = False).to_frame(name= 'Variance')\
        .style.background_gradient(cmap=sns.light_palette("green", as_cmap=True)).format('{:.2f}'))
def Correlation_Plot (Df,Fig_Size):
    """Draw a lower-triangle heatmap of the pairwise correlations in ``Df``.

    Parameters
    ----------
    Df : pandas.DataFrame
        Input frame; only numeric and boolean columns are correlated.
    Fig_Size : float
        Width and height of the square figure, in inches.

    Returns
    -------
    None
        The heatmap is drawn on a newly created matplotlib figure.
    """
    # Select numeric/bool columns explicitly: newer pandas raises on string
    # columns instead of silently dropping them.
    Correlation_Matrix = Df.select_dtypes(include=['number', 'bool']).corr().round(2)
    # Hide the strictly-upper triangle; the diagonal stays visible.
    mask = np.triu(np.ones(Correlation_Matrix.shape, dtype=bool), k=1)
    Fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    # vmin=0 means negative correlations share the lightest colour.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": .6})
# Correlation heatmap of Data on a 16 x 16 inch figure.
Correlation_Plot (Data, 16)
Correlations of features with customer Churn.
# Correlate numeric/bool columns only; newer pandas raises on string columns such as 'customer ID'.
Temp = Data.select_dtypes(include=['number', 'bool']).corr().round(2)
# Sort ascending and drop the last row, which is Churn's correlation with itself.
# Styler.format('{:.2f}') replaces the removed Styler.set_precision(2).
Temp['Churn'].sort_values().to_frame(name= 'Correlation')[:-1].style.background_gradient(cmap='RdYlGn',
                                                                                        subset=['Correlation']).format('{:.2f}')
# Histograms of the three continuous features, each with a fitted normal curve.
Temp = ['Tenure','Monthly Charges','Total Charges']
fig, ax = plt.subplots(nrows=1, ncols=3, figsize = (16, 6))
for _axis, _column in zip(ax, Temp):
    _ = sns.distplot(Data[_column],
                     fit=norm, kde=False, color='seagreen', ax= _axis)
# Drop the customer identifier column before modelling.
df = Data.drop(columns = ['customer ID'])
Target = 'Churn'
# Feature matrix and target vector.
X = df.drop(columns = [Target])
y = df[Target]
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Small table with the shapes of the four resulting sets.
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Furthermore, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# Fit the scaler on the training set only, then apply the same transform to the test set.
scaler = StandardScaler()
X_train_STD = pd.DataFrame(data = scaler.fit_transform(X_train), columns = X_train.columns)
X_test_STD = pd.DataFrame(data = scaler.transform(X_test), columns = X_test.columns)
A number of functions that we would use.
def Performance(clf, X_test = X_test_STD):
    """Display accuracy, F1, precision and recall of ``clf`` on the test set.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict`` and ``score``.
    X_test : array-like, default X_test_STD
        Test features; they are scored against the module-level ``y_test``.

    Returns
    -------
    None
        A one-row table is rendered with ``display``. Columns appear in the
        order they are declared below.
    """
    y_pred = clf.predict(X_test)
    scores = {'Score': clf.score(X_test, y_test),
              'F1 Score': f1_score(y_test.values, y_pred, average= 'weighted'),
              'Precision Score': precision_score(y_test.values, y_pred, average= 'weighted'),
              'Recall Score': recall_score(y_test.values, y_pred, average= 'weighted')}
    # DataFrame.append was removed in pandas 2.0; build the one-row frame directly.
    df = pd.DataFrame([scores])
    styler = df.style
    # Styler.hide_index() was replaced by Styler.hide(axis='index'); support both.
    styler = styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()
    display(styler)
def highlight_max(s):
    """Return one CSS string per element of ``s``, highlighting the maxima.

    Elements equal to ``s.max()`` get a SpringGreen background; all others
    get an empty style.
    """
    is_peak = s == s.max()
    return np.where(is_peak, 'background-color: SpringGreen', '').tolist()
def Feature_Ranking(clf):
    """Run recursive feature elimination for every subset size and pick the best.

    For each ``n`` from 2 up to (but excluding) the number of columns in the
    module-level ``X``, an RFE selector wrapping ``clf`` is fitted on
    ``X_train_STD`` / ``y_train`` and scored by accuracy on ``X_test_STD`` /
    ``y_test``. The table of results is displayed with the best score
    highlighted.

    Parameters
    ----------
    clf : estimator
        Estimator accepted by ``sklearn.feature_selection.RFE``.

    Returns
    -------
    list of str
        Feature names of the first subset that reaches the maximum score.

    Raises
    ------
    ValueError
        If ``X`` has too few columns for any subset size to be evaluated.
    """
    # NOTE(review): subsets are compared on the test set, so the winning score
    # is an optimistic estimate of generalisation.
    column_order = ['Number of Features to Select', 'Score', 'Features', 'Best Features']
    rows = []
    for n in range(2, X.shape[1]):
        selector = RFE(estimator= clf, n_features_to_select=n, verbose=0)
        selector.fit(X_train_STD, y_train)
        rows.append({'Number of Features to Select': n,
                     'Score':metrics.accuracy_score(y_test, selector.predict(X_test_STD)),
                     'Features': X.columns[selector.support_].tolist(),
                     'Best Features':X.columns[selector.ranking_ == 1].tolist()})
    if not rows:
        raise ValueError('Feature_Ranking needs at least three feature columns in X.')
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every
    # iteration; build the frame once from the collected rows instead.
    df = pd.DataFrame(rows, columns=column_order)
    display(df.style.apply(highlight_max, subset=['Score']))
    return df.loc[df.Score == df.Score.max(), 'Features'].values[0]
def ROC_Curve(clf, X_test = X_test_STD):
    """Plot the ROC curve of ``clf`` against the module-level ``y_test``.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict_proba``; the second probability column is
        used as the score.
    X_test : array-like, default X_test_STD
        Test features.

    Returns
    -------
    None
        The curve, the diagonal reference line and an AUC legend are drawn
        on a new matplotlib figure.
    """
    positive_scores = clf.predict_proba(X_test)[:,1]
    # false positive rates, true positive rates and thresholds
    fpr, tpr, threshold = metrics.roc_curve(y_test, positive_scores)
    auc_label = 'AUC = %0.2f' % metrics.auc(fpr, tpr)
    fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
    ax.plot(fpr, tpr, lw=2, label = auc_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1],'r--', lw=2)
    ax.legend(loc = 'lower right', fontsize = 14)
    ax.set_xlim([0,1])
    ax.set_xlabel('False Positive Rate (FPR)')
    ax.set_ylabel('True Positive Rate (TPR)')
For this set of data, we would like to implement a Random Forest Classifier, which creates a set of decision trees from randomly selected subsets of the training set. It uses voting across the different decision trees to classify the final outputs.
# Baseline: random forest with default hyper-parameters on all standardized features.
# NOTE(review): no random_state is set, so scores may differ between runs.
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD,y_train)
Performance(rfc)
ROC_Curve(rfc)
However, we only need to implement features that are useful for classifications. In this article, we demonstrated the importance of feature ranking.
# Evaluate every feature-subset size with RFE and keep the best-scoring subset.
Best_Features = Feature_Ranking(rfc)
Thus, the best features:
print(Best_Features)
# Refit a fresh forest using only the selected standardized features.
rfc = RandomForestClassifier()
_ = rfc.fit(X_train_STD[Best_Features],y_train)
# Evaluate on the matching columns of the standardized test set.
Performance(rfc, X_test_STD[Best_Features])
ROC_Curve(rfc, X_test_STD[Best_Features])
The main classification metrics:
# The forest was fitted on standardized features, so it must also predict on
# X_test_STD; the unscaled X_test feeds it values on a different scale.
pd.DataFrame(classification_report(y_test,rfc.predict(X_test_STD[Best_Features]),
                                   output_dict = True, target_names = ['No_Churn','Churn']))
A confusion matrix allows the visualization of the performance of a classification model.
# Confusion matrix on the standardized test set, normalized over the true labels.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 in favour of
# ConfusionMatrixDisplay.from_estimator — confirm the pinned scikit-learn version.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
_ = plot_confusion_matrix(rfc, X_test_STD[Best_Features], y_test, display_labels= ['No_Churn','Churn'],
cmap= "Greens", normalize= 'true', ax = ax)
_ = ax.set_title('Normalized Confusion Matrix')
Finally, customer churn can be predicted using our model for the test data.
# Predicted class probabilities for every test row, shown as a two-column table.
display(pd.DataFrame(list(rfc.predict_proba(X_test_STD[Best_Features].values)), columns=['No_Churn','Churn']))